clear all
set more off
set maxvar 15000
clear matrix

/*-----------------------------------------------------------------------------
	Purpose: Clean and code the 1966-1981 survey waves of the NLS Young Men
	         1966 cohort (NLSYM66), stack them into a long person-year file,
	         and cut the 1981 cross section used for the main analysis.
	Creates: ./wrkdata/nlsym66_4droptable.dta
	         ./wrkdata/MD_benchmarkingexercise_nlsym66.dta
	         ./wrkdata/nlsym66_foranalysis.dta
	         ./wrkdata/nlsym66_newxsec.dta
	Needs:   global Mydirectory1 set by the caller; the commands renvars, fre
	         and unique, which are not part of official Stata.

	Note: The goal is to observe family income at around age 40 for most
	      respondents. The median age in the 1966 wave is 18, so the median
	      respondent reaches 40 around 1988. The survey year closest to 1988
	      in this extract is 1981, so the cleaned 1981 cross section is used.
----------------------------------------------------------------------------- */

cd "$Mydirectory1/1_DataSources/NLSYM66/"

* Run the value-label do-files for the four supplementary extracts.
* The matching .dta files in ./rawdata are merged further down; that these
* do-files are what saves them is the original author's note -- TODO confirm.
foreach extract in hhsize Rwho_livewith_age14 classofwrker_nlsym sibs {
	quietly run "./rawdata/`extract'-value-labels.do"
}

* Load the main extract (NLS-provided clean-up do-file) and order it by the
* 1966 respondent identifier, which is the merge key used below.
run "./rawdata/NLSYM66-ISM-value-labels.do"
sort R0000100_1966

*------------------------------------------------------------------------ 
*	 Merge in other relevant variables
*------------------------------------------------------------------------ 
	* Every supplementary file is merged one-to-one on the 1966 respondent
	* identifier. nogenerate keeps _merge from being created, so there is
	* nothing to drop afterwards. Unmatched rows from either side are kept.

	* Household size 
		merge 1:1 R0000100_1966 using "./rawdata/hhsize.dta", nogenerate
	
	* Who R lived with at age 14
		merge 1:1 R0000100_1966 using "./rawdata/Rwho_livewith_age14.dta", nogenerate
	
	* Class of worker (i.e., self-employed, private, gov't, without pay)
		merge 1:1 R0000100_1966 using "./rawdata/classofwrker_nlsym.dta", nogenerate

	* Sibling ID code
		merge 1:1 R0000100_1966 using "./rawdata/sibs", nogenerate

*------------------------------------------------------------------------ 
*	 Create dummy variable for 1981 respondents 
*------------------------------------------------------------------------ 
	* Equals 1 unless R0805800_1981 holds the code -5. It is built here,
	* before that variable is renamed and its negative codes are set to missing.
	gen interviewed1981 = (R0805800_1981 != -5)

*------------------------------------------------------------------------ 
*	 RENAME AND CLEAN VARIABLES
*------------------------------------------------------------------------ 
*----------------------
*	Interview information	
*----------------------
	* Interview month (available all years).
	* Each raw name ends in its four-digit survey year; that suffix is kept
	* so the variables can later be reshaped on year.
		foreach v of varlist R0002101_1966 R0063511_1967 R0115011_1968 R0173605_1969 R0231500_1970 R0302100_1971 R0392100_1973 ///
				R0412700_1975 R0437501_1976 R0547700_1978 R0596400_1980 R0683200_1981 {
			local yr = substr("`v'", -4, .)
			rename `v' doim`yr'
		}
		quietly mvdecode doim*, mv(-5 -4 -3 -2 -1)
	
	* Interview year--get from reshape
	
	* Cross-sectional sample weights 
		foreach v of varlist R0000200_1966 R0064500_1967 R0116910_1968 R0175110_1969 R0234300_1970 R0305910_1971 R0392510_1973 ///
				R0413010_1975 R0439000_1976 R0546810_1978 R0596710_1980 R0683510_1981 {
			local yr = substr("`v'", -4, .)
			rename `v' weight`yr'
		}
		quietly mvdecode weight*, mv(-5 -4 -3 -2 -1)
		summarize weight*

*-------------------
*	Id variables
*-------------------
	* Identification codes
		rename (R0000100_1966 R0000101_1966 R0000300_1966) (id random_id hhid)
		quietly mvdecode id random_id hhid, mv(-5 -4 -3 -2 -1)

	* ID links to other surveys
		rename (R0000352_1966 R0000351_1966) (oldmen_id oldwomen_id)
		quietly mvdecode oldmen_id oldwomen_id, mv(-5 -4 -3 -2 -1)
		summarize id random_id hhid oldwomen_id oldmen_id

*----------------
* Unions 
*----------------
	/* Note: (1) Available 1969-1981.
	         (2) Survey question is: "Are wages on 
	             current job set by collective bargaining?"
	*/

		* Rename each wave to union<year>, taking the year from the raw name.
		foreach v of varlist R0187700_1969 R0246800_1970 R0320200_1971 R0455800_1976 R0554900_1978 R0602000_1980 R0693200_1981 {
			local yr = substr("`v'", -4, 4)
			rename `v' union`yr'
		}
		quietly mvdecode union*, mv(-5 -4 -3 -2 -1)
  
*----------------
* Demographics
*----------------

*Respondent
	*Sex 
		* Both are constants: every record in this file is coded male.
		gen sex=1
		gen male=1

	* Birth year
		* Stored as a two-digit year in the raw data, hence the +1900.
		rename R0002203_1966 doby
			qui:mvdecode doby, mv(-5 -4 -3 -2 -1)
			replace doby=1900+doby

	* Birth month
		* NOTE(review): the old name is written without the _1966 suffix used
		* everywhere else, so this rename relies on variable abbreviation.
		* Confirm the raw variable name.
		rename R0002202 dobm
			qui:mvdecode dobm, mv(-5 -4 -3 -2 -1)
	* Race
		/* Note: (1) See https://www.nlsinfo.org/content/cohorts/older-and-young-men/topical-guide/household/race-ethnicity-and-nationality
		             for more info on race and ethnicity variables in NLSYM.
                 (2) Mexicans, Puerto Ricans, and other Latin Americans coded as "white".
                 (3) Japanese, Chinese, Native American, Korean, Hindu, Eskimo, coded as "."
        */
		rename R0002300_1966 race
		* Decode missing codes BEFORE deriving black. Building the dummy from
		* the raw variable turned every negative (missing) race code into
		* black=0; it is now missing whenever race is missing.
		qui:mvdecode race, mv(-5 -4 -3 -2 -1)
		gen black = (race==2) if !mi(race)
		fre race black
	
	* Age at interview--calculated after reshape occurs

	* Marital status
		renvars R0002400_1966 R0063500_1967 R0115000_1968 R0173600_1969 R0234100_1970 R0305800_1971 R0399900_1973 ///
				R0421800_1975 R0507300_1976 R0548100_1978 R0596800_1980 R0682400_1981  ///			
			, map("marital_status" + substr("@", -4,.)) 
		qui:mvdecode marital_status*, mv(-5 -4 -3 -2 -1)	
		sum marital_status*

	* Relationship to head--not available

	* Region of residence (during first survey year)
		* The clone is taken from the raw variable, so its negative missing
		* codes must be decoded here as well. The mvdecode of
		* region_residence* below does not reach grewup_south.
		clonevar  grewup_south = R0002451_1966
		qui:mvdecode grewup_south, mv(-5 -4 -3 -2 -1)

	* R's current region of residence
		renvars R0002451_1966 R0063551_1967 R0115051_1968 R0173651_1969 R0232351_1970 R0303051_1971 R0393252_1973  ///
				R0413751_1975 R0437511_1976 R0547530_1978 R0640400_1980 R0805800_1981  ///
			, map("region_residence" + substr("@", -4,.)) 
		qui:mvdecode region_residence*, mv(-5 -4 -3 -2 -1)	
		sum region_residence*

	*Foreign Born
		* Code 5 of the country-of-birth item is treated as foreign born;
		* the dummy is missing when country of birth is missing.
		rename R0038100_1966 cntrybirth1966
		qui: mvdecode cntrybirth*, mv(-5 -4 -3 -2 -1)
		gen foreignborn = .
		replace foreignborn =1 if cntrybirth1966==5
		replace foreignborn=0 if cntrybirth1966!=5 & !mi(cntrybirth1966)
		sum foreignborn	

*Parents
	* Who R lived with when R was 14
		rename R0039700_1966 Rwho_livew_age14
		qui:mvdecode Rwho_livew_age14, mv(-5 -4 -3 -2 -1)
	
	* Foreign born
		* Any value other than 1 counts as foreign born. The raw items are
		* never mvdecoded, so the negative missing codes (-5 to -1) used to
		* pass the "!=1 & !mi()" test and were coded as foreign born. They are
		* now excluded explicitly and, like system-missing values in the
		* original expression, yield 0.
		* NOTE(review): setting these cases to missing instead (as is done for
		* foreignborn above) may be preferable -- confirm intended treatment.
		foreach parent in father mother {
			if "`parent'" == "father" local raw R0039000_1966
			if "`parent'" == "mother" local raw R0039100_1966
			gen `parent'foreign = `raw' != 1 & !inlist(`raw', -5, -4, -3, -2, -1) & !mi(`raw')
		}

*-------------------------------
* Education
*------------------------------- 

*Respondents
	* Highest Grade Completed
		* Only the 1973, 1975 and 1976 waves are renamed here. All five
		* negative codes are decoded (previously only -5 and -4), so that a
		* leftover -3/-2/-1 cannot end up as a respondent's "maximum" grade.
		renvars R0410000_1973 R0434100_1975 R0543940_1976, map("hgc" + substr("@", -4,4))
		mvdecode hgc* , mv(-5 -4 -3 -2 -1)
		sum hgc*

*Parents
	*Highest Grade Completed by father (available in 1966)
		rename R0063100_1966 hgc_dad

	* Highest Grade Completed by mother (available in 1966)
		rename R0063300_1966 hgc_mom

		qui:mvdecode hgc_dad hgc_mom , mv(-5 -4 -3 -2 -1)
		fre  hgc_dad hgc_mom 	

*-------------------------------
* HH size
*------------------------------- 

	renvars R0369800_1971 R0563900_1978 R0611700_1980 R0744000_1981, map("R_hhsize_plusR"+ substr("@",-4,4))
	
	* Tabulate the raw codes before any recoding, for the log.
	foreach num of numlist 71 78 80 81 {
		tab R_hhsize_plusR19`num', m
	}
	* -3 was missing from this list, unlike every other mvdecode in the file;
	* it is included so a -3 cannot survive as a household size.
	mvdecode R_hhsize_plusR* , mv(-5 -4 -3 -2 -1) 

	//TOPCODED VALUES
	/* Note: In the years below, household size should be topcoded at 16, 
	         but for some reason there is a "17" value. In other surveys, 
	         any values exceeding the topcoded value have been recoded to have the 
	         topcoded value. The same will be done here.*/
	replace R_hhsize_plusR1971 =16 if R_hhsize_plusR1971==17 

*-------------------------------
* Siblings
*------------------------------- 

	* Ordinal words used in the variable labels, indexed by sibling number.
	local ordinals 1st 2nd 3rd 4th 5th

	* Brothers: R0000301-R0000305 become idcode_brother1_ym ... idcode_brother5_ym.
	* The label is attached using the full new name rather than an
	* abbreviation, which would break if another variable shared the prefix.
	local num 0
	foreach var of varlist R0000301_1966 R0000302_1966 R0000303_1966 R0000304_1966 R0000305_1966 {
		local ++num
		local lab : word `num' of `ordinals'

		ren `var' idcode_brother`num'_ym
		label var idcode_brother`num'_ym "ID code of `lab' brother in NLSYM (taken from YM survey)"
	}

	* Sisters: R0000306-R0000310 become idcode_sister1_ym ... idcode_sister5_ym.
	local num 0
	foreach var of varlist R0000306_1966 R0000307_1966 R0000308_1966 R0000309_1966 R0000310_1966 {
		local ++num
		local lab : word `num' of `ordinals'

		ren `var' idcode_sister`num'_ym
		label var idcode_sister`num'_ym "ID code of `lab' sister in NLSYW (taken from YM survey)"
	}

	* Dummy: R has a sibling in nlsym or nlsyw surveys
	* lookfor returns every variable whose name or label contains "idcode".
	* The result is copied to a local straight away, before anything else runs.
	lookfor idcode
	local vars "`r(varlist)'"
	di "`vars'"

	gen sib_nlsym66 =.

	foreach v of local vars {
		di "`v'"
		* Skip anything lookfor matched that is not numeric; comparing a
		* string variable with -4 would stop the do-file.
		capture confirm numeric variable `v'
		if _rc continue
		* -4 is the code treated as "no sibling ID". A system-missing value
		* (e.g. a record unmatched in the sibs merge) is not a sibling ID
		* either; without the !mi() guard it satisfied "!= -4" and was
		* counted as having a sibling.
		replace sib_nlsym66 =1 if `v'!=-4 & !mi(`v') & sib_nlsym66==.
	}

	replace sib_nlsym66 =0 if sib_nlsym66==.
	tab sib_nlsym66, m

*-------------------------------
* Employment 
*-------------------------------

*Respondents
	* Class of worker in 1981. All five negative codes are decoded
	* (previously only -5 and -4). Otherwise a -3/-2/-1 stays a valid value
	* and the self-employment dummy built from this variable after the
	* reshape would be 0 rather than missing for those respondents.
	ren R0810600_1981 class_wrker_1981
	mvdecode class_wrker_1981, mv(-5 -4 -3 -2 -1) 
	
*Father--not available

*-------------------------------
* Occupation 
*-------------------------------
*Respondents
	/* Note: There are two variables for "current/last job (3-digit Census code)". 
	         Will combine both to recover the most respondents. */

	    * Version 1: uncollapsed (no 1971 wave in this list)
		foreach v of varlist R0019600_1966 R0074200_1967 R0127000_1968 R0186500_1969 R0245600_1970 R0397700_1973 R0419700_1975 ///
				R0452700_1976 R0552400_1978 R0599500_1980 R0688700_1981 {
			local yr = substr("`v'", -4, 4)
			rename `v' initoccu`yr'
		}
		quietly mvdecode initoccu* , mv(-5 -4 -3 -2 -1)
		summarize initoccu*

	    * Version 2: collapsed (no 1966 wave in this list)
		foreach v of varlist R0111000_1967 R0165700_1968 R0222200_1969 R0297900_1970 R0385800_1971 R0410700_1973 R0434800_1975 ///
				R0544400_1976 R0590600_1978 R0681300_1980 R0811000_1981 {
			local yr = substr("`v'", -4, 4)
			rename `v' finoccu`yr'
		}
		quietly mvdecode finoccu* , mv(-5 -4 -3 -2 -1)
		summarize finoccu*

		* Create occupation variable for respondent.
		* occu<year> starts from the collapsed code and falls back on the
		* uncollapsed one where the collapsed code is missing. The empty
		* finoccu1966 placeholder lets 1966 go through the same loop, so
		* occu1966 comes entirely from initoccu1966.
		gen finoccu1966=.
		forvalues y = 1966/1981 {
			capture confirm variable finoccu`y'
			if _rc continue		// no collapsed code for this year: nothing to build

			gen occu`y' = finoccu`y'

			capture confirm variable initoccu`y'
			if _rc == 0 {
				replace occu`y' = initoccu`y' if mi(occu`y') & !mi(initoccu`y')
			}
		}
		summarize initoccu1969 finoccu1969 occu1969

*Parents 
	* Occupation of father when R was age 14 (available in 1966)
		* Only -2 and -1 are decoded here; the crosswalk step after the
		* reshape still expects to see the code -4 in this variable.
		clonevar occu_dad_r14 = R0039800_1966
		quietly mvdecode occu_dad_r14 , mv(-2 -1)

	* Occupation of mother when R age 14 (available in 1966)
		/* Note: "Occupation of dad" variable is actually 
		         "occupation of head of household when R was 
		         14". Will use this variable + info on who
		         R grew up with to obtain mother occupation 
		         for some respondents. */
	
*----------------
* Family Income
*----------------

*Respondents
	* Rename every wave to famincv2_<year>, taking the year from the raw name.
	foreach v of varlist R0062410_1966 R0114110_1967 R0167810_1968 R0224610_1969 R0301710_1970 R0391260_1971 R0408610_1973 ///
			R0431710_1975 R0545510_1976 R0590310_1978 R0638010_1980 R0811710_1981 {
		local yr = substr("`v'", -4, .)
		rename `v' famincv2_`yr'
	}

	* Every negative value is a missing code.
	forvalues y = 1966/1981 {
		capture confirm variable famincv2_`y'
		if _rc continue
		replace famincv2_`y' = . if famincv2_`y' < 0
	}
	summarize famincv2_*
	
	* Collapse 1981 income into brackets and assign one value per bracket.
	foreach y in 1981 {
		capture confirm variable famincv2_`y'
		if _rc continue

		rename famincv2_`y' c_famincv2_`y'
		local raw c_famincv2_`y'
		local inc famincv_`y'
		gen `inc' = .

		* Triplets of: lower bound, upper bound, assigned value.
		* The bottom bracket gets 0.75 x its upper limit.
		local bins 0 1999 0.75*2000 ///
			2000 5999 4000 ///
			6000 9999 8000 ///
			10000 14999 12500 ///
			15000 19999 17500 ///
			20000 24999 22500 ///
			25000 29999 27500 ///
			30000 34999 32500 ///
			35000 39999 37500 ///
			40000 49999 45000
		tokenize `bins'
		while "`1'" != "" {
			replace `inc' = `3' if inrange(`raw', `1', `2')
			macro shift 3
		}
		* Open-ended top bracket gets 1.25 x its lower limit.
		replace `inc' = 1.25*50000 if `raw' >= 50000 & !mi(`raw')
		replace `inc' = . if interviewed1981 != 1

		* Flags are 1 or missing; they are turned into 0/1 after the reshape.
		gen bottomcoded_son_v`y' = 1 if `inc' == 0.75*2000
		gen topcoded_son_v`y' = 1 if `inc' == 1.25*50000

		tab `inc' if interviewed1981 == 1
	}

* Keep only individuals with year of birth information
	drop if mi(doby) 

*------------------------------------------------------------------------ 
*					RESHAPE TO LONG
*------------------------------------------------------------------------
	* Variables carried into the long file, grouped by topic.
	local intvars="doim*  weight* id random_id hhid oldmen_id oldwomen_id union* interviewed1981"
	local demovars="sex male doby dobm race black marital_status* region_residence* grewup_south  foreignborn fatherforeign motherforeign"
	local edu="hgc_dad hgc_mom hgc*"
	local occuvars="class_wrker_1981 occu* occu_dad_r14" 
	local faminvars="famincv_1981 bottomcoded_son_v1981 topcoded_son_v1981 Rwho_livew_age14 R_hhsize_plusR* sib_nlsym66 idcode_*"
	keep  id `intvars' `demovars' `edu' `occuvars' `faminvars'

	* reshape long (numeric j) only picks up variables named <stub><number>.
	* The top/bottom-code flags are named *_son_v1981, so the stubs
	* topcoded_son_ and bottomcoded_son_ matched nothing and the long flags
	* came out 0 for everyone after the replace statements below.
	* Clone them under names the stubs do match. The *_v1981 originals are
	* left in place for any downstream code that reads them.
	clonevar bottomcoded_son_1981 = bottomcoded_son_v1981
	clonevar topcoded_son_1981    = topcoded_son_v1981

	* One row per respondent and survey year; doiy holds the year suffix.
	qui: reshape long doim weight marital_status region_residence hgc occu famincv_  ///
						topcoded_son_ bottomcoded_son_ union R_hhsize_plusR  , i(id) j(doiy)
						
	* Flags are 1 only in 1981 rows that were top/bottom coded; 0 elsewhere.
	replace topcoded_son_=0 if topcoded_son_==.
	replace bottomcoded_son_=0 if bottomcoded_son_==.

	rename topcoded_son_ topcoded_son
	label var topcoded_son "Son's family income top coded"
	rename bottomcoded_son_ bottomcoded_son
	label var bottomcoded_son "Son's family income bottom coded"

	* Only famincv_1981 fed this stub, so the long variable is non-missing
	* in doiy==1981 rows only; it gets its year-specific name back.
	ren famincv_ famincv_1981
	label var famincv_1981 "R's Family Income, continuous (based on midpoints of continuous ranges)" 

	* Survey year
	gen year=doiy
	label var year "Survey year" 

*------------------------------------------------------------------------ 
*						GENERATE NEW VARS
*------------------------------------------------------------------------
	describe
*----------------------
* Interview information	
*----------------------
	summarize doim doiy weight 

*-----------
* Id vars 
*-----------
	* Respondent count, then a check that id-year identifies rows.
	unique id 
	unique id year  
	summarize id

*----------------
* Unions 
*----------------
	rename union unionR
	label variable unionR "Wages set by union"
	label define yesno 0 "No" 1 "Yes"
	label values unionR yesno

*----------------
* Veterans 
*----------------

	* Placeholder: created as all-missing and never filled in this file.
	generate veteran = .

*----------------
* Demographics
*----------------
*Respondents
	*Sex
		fre sex 
		label variable sex "R's sex"	

	*Age 
		/*Note: When birth month is missing, 
	            the median will be imputed.
		        Same for interview month. */
		summarize dobm doim, detail
		replace dobm = 7 if mi(dobm)
		replace doim = 11 if mi(doim)

		* Completed years between birth month and interview month.
		generate age = floor((ym(doiy, doim) - ym(doby, dobm)) / 12)

		generate agesq = age * age

	*Birth cohorts	
		clonevar dob = doby
		tab dob, m
		label variable dob "Year of birth"

	*Birth decade
		generate decade = 10 * floor(dob/10)
		tab decade, m
		label variable decade "Decade of birth"

		//Generate dummies for each decade
		tab decade, gen(decade_)

	* Race
		* Code 3 is set to missing, leaving white and black only.
		fre race
		replace race = . if race == 3
		label define race 1 "white" 2 "black" 
		label values race race

	* Foreign born
		fre foreignborn
		label variable foreignborn "R was born outside the US"

	* Marital status
		* One dummy per status; each is missing when marital_status is missing.
		fre marital_status
		generate married = inlist(marital_status, 1, 2) if marital_status < .
		tab married, m

		foreach pair in "widowed 3" "divorced 4" "never_married 6" "separated 5" {
			gettoken status code : pair
			generate `status' = (marital_status == `code') if marital_status < .
			tab `status', m
		}
		
*Parents
	*Foreign born
		fre fatherforeign
		label variable fatherforeign "R's father was born outside the US"

		fre motherforeign
		label variable motherforeign "R's mother was born outside the US"

*-------------------------------
* Education
*-------------------------------

*Respondents
	* Max educational level
		* yrsschool is the highest grade reported in any wave, repeated on
		* every row of the respondent. The wave-specific value is kept as
		* edu_73_76 and hgc is re-created as a copy of the maximum.
		bysort id: egen yrsschool = max(hgc)
		rename hgc edu_73_76
		clonevar hgc = yrsschool	// Highest grade/yr compltd

		label variable yrsschool "Years of school"
		tab yrsschool, m nol

	* Categorical edu var
		fre hgc
		generate eduR = .
		replace eduR = 0 if hgc == 0
		replace eduR = 1 if hgc >= 1 & hgc < 8		// some grade school
		replace eduR = 2 if hgc == 8			// completed 8th grade
		replace eduR = 3 if hgc > 8 & hgc < 12		// some HS
		replace eduR = 4 if hgc == 12			// 4 years of HS
		replace eduR = 5 if hgc > 12 & hgc < 16		// 1-3 years of college
		replace eduR = 6 if hgc > 15 & hgc < .		// 4 or more years of college (like BA)
		tab eduR, m

		* Years of school (binned)
		* Values of 20 or more are left missing by the last rule.
		generate yrsschool_bin = .
		replace yrsschool_bin = 0  if yrsschool == 0
		replace yrsschool_bin = 6  if yrsschool > 0 & yrsschool < 8
		replace yrsschool_bin = 8  if yrsschool == 8
		replace yrsschool_bin = 10 if yrsschool > 8 & yrsschool < 12
		replace yrsschool_bin = 12 if yrsschool == 12
		replace yrsschool_bin = 14 if yrsschool > 12 & yrsschool < 16
		replace yrsschool_bin = 16 if yrsschool >= 16 & yrsschool < 20
		tab yrsschool_bin, m
		label variable yrsschool_bin "Years of school, binned"

	* High school degree
		generate hs_ed = (eduR >= 4) if !mi(eduR)
		tab hs_ed, m
		label variable hs_ed "HS educated"

	* College education degree
		generate coll_ed = (eduR >= 6) if !mi(eduR)
		tab coll_ed, m
		label variable coll_ed "Coll educated"

*Parents
	fre  hgc_mom hgc_dad
	clonevar dad_ed_raw = hgc_dad
	clonevar mom_ed_raw = hgc_mom

		* Education categories (0-6), same cut points as for the respondent.
		* NOTE(review): raw values of 18 or more (other than 25) match no
		* rule and stay missing -- confirm against the codebook that such
		* values do not occur.
		foreach name in mom dad {
			local raw `name'_ed_raw
			generate edu_`name' = .
			replace edu_`name' = 0 if `raw' == 0			// none
			replace edu_`name' = 1 if `raw' > 0 & `raw' < 8		// some grade school
			replace edu_`name' = 2 if `raw' == 8			// completed 8th grade
			replace edu_`name' = 3 if `raw' > 8 & `raw' < 12	// some HS
			replace edu_`name' = 4 if `raw' == 12			// 4 years of HS
			replace edu_`name' = 5 if `raw' > 12 & `raw' < 16	// some college
			replace edu_`name' = 6 if `raw' == 16 | `raw' == 17	// college
			replace edu_`name' = 4 if `raw' == 25			// raw code 25 is placed in the 4-years-of-HS category
			label variable edu_`name' "Educational categories for `name'"
		}

	* Binned
		foreach name in mom dad {
			* Category k (0..6) maps to the k-th entry of 0 6 8 10 12 14 16.
			generate edu_`name'_bin = .
			local k 0
			foreach yrs of numlist 0 6 8 10 12 14 16 {
				replace edu_`name'_bin = `yrs' if edu_`name' == `k'
				local ++k
			}
			tab edu_`name'_bin, m
			label variable edu_`name'_bin "`name' Years of school from bins"

		* High school degree		
			generate `name'_hs_ed = (edu_`name' >= 4) if edu_`name' < .
			tab `name'_hs_ed, m

		* College education degree
			generate `name'_coll_ed = (edu_`name' >= 6) if edu_`name' < .
			tab `name'_coll_ed, m
			label variable `name'_hs_ed "`name' HS educated"
			label variable `name'_coll_ed "`name' College educated"
		}

	* Years of school		
		foreach a in dad mom {
			rename `a'_ed_raw yrsschool_`a'
			tab yrsschool_`a', m
		}
	
*-------------------------------
* HH size
*------------------------------- 

	* Household size excluding the respondent.
	generate R_hhsize_minusR = R_hhsize_plusR - 1
	tab R_hhsize_minusR, m
	
	label variable R_hhsize_plusR "total # of persons in R's hh (including R)"
	label variable R_hhsize_minusR "total # of persons in R's hh (NOT including R)"

*-------------------------------
* Employment 
*-------------------------------
*Respondents
	* Class-of-worker code 4 is treated as self-employed. The source is the
	* 1981 item, so the dummy is the same on every row of a respondent.
	generate selfemployed = (class_wrker_1981 == 4) if class_wrker_1981 != .
	tab selfemployed, m

*-------------------------------
* Occupation 
*-------------------------------
	summarize occu occu_dad_r14

	/*Note: 1960 Census occupation codes are used for years 
	        1966-1981 for respondent and parental occupation
	        variables. */
	
	clonevar occu1960 = occu

*-----------------------------------
* Crosswalk occupations (parents) 
*-----------------------------------
	* The crosswalk is keyed on census1960 and supplies fatheroccej.
	clonevar census1960 = occu_dad_r14
	
	* keep(master match) discards crosswalk rows that match no respondent.
	merge m:1 census1960 using "../Crosswalks/Crosswalk_1960Census_toANES.dta", keep(master match)
		tab census1960 if _merge==1, m
		* Unmatched respondents may only have a missing code, -4 or 995.
		assert (census1960==. | census1960==-4 | census1960==995) if _merge==1 
		drop _merge
	
	/* Fix fatheroccej so that only respondents with a father or male head 
	   of household receive a value. */
	generate fatheroccej_nochange = fatheroccej
	label variable fatheroccej_nochange "fatheroccej but unchanged since merge with coarsened anes occs"
	
	* NOTE(review): ">=7" is also true when Rwho_livew_age14 is missing, so
	* respondents with no living-arrangement information lose fatheroccej
	* here as well. Confirm this is intended, given that
	* headofhh_father_imputed later treats those respondents as having
	* lived with their father.
	local no_male_head "Rwho_livew_age14==5 | Rwho_livew_age14>=7"
	replace fatheroccej = . if `no_male_head'
	tab fatheroccej, m
	assert fatheroccej==. if fatheroccej_nochange==. | `no_male_head'

	//Obtain mother occupation from head of household occupation var (fatheroccej_nochange) 
	   /* Note: Only respondents who grew up with a mom (no father present)
	            or another female head of household will receive a non-missing value. */
	generate motheroccej = .
	replace motheroccej = fatheroccej_nochange if inlist(Rwho_livew_age14,5,7)
	tab motheroccej, m
	assert motheroccej==. if fatheroccej_nochange==. | !inlist(Rwho_livew_age14,5,7) 

	//Rename fatheroccej so that respondent occupations can be crosswalked to coarsened ANES occupations
	rename fatheroccej father_occ1950ej
	rename fatheroccej_nochange father_occ1950ej_nochange
	
	drop census1960

********************************************************************************************************
	* For fathers/mothers: Fix occupations for self-employed businessmen, managers, or officials before merging income scores * 
*******************************************************************************************************
/* Note: Not able to find class of worker for R's mother or father
         when R was growing up. */	

*----------------------------------------
* Crosswalk occupations (respondents) 
*----------------------------------------
	* Same crosswalk as for the parents, now keyed on the respondent's own
	* occupation. It brings in a fresh fatheroccej, which here holds the
	* respondent's coarsened code.
	clonevar census1960 = occu1960

	merge m:1 census1960 using "../Crosswalks/Crosswalk_1960Census_toANES.dta", keep(master match)
		* Only respondents without an occupation code may go unmatched.
		assert census1960==. if _merge==1
		drop _merge

	* Keep the untouched crosswalk output under its own name and work on a copy.
	rename fatheroccej occej_from1960
	generate occRej = occej_from1960

********************************************************************************************************
* For respondents: Fix occupations for self-employed businessmen, managers, or officials before merging income scores * 
*******************************************************************************************************
	* Self-employed respondents in category 28 are moved to category 21.
	replace occRej = 21 if occRej==28 & selfemployed==1

	//Rename respondent and father occupations
	* Give the parental variables back their original names.
	rename father_occ1950ej fatheroccej
	rename father_occ1950ej_nochange fatheroccej_nochange
	
*------------------------------------------
* Head of hh dummies + father_imputed dummy	
*------------------------------------------
	
	* Each dummy is missing when Rwho_livew_age14 is missing.
	* Codes 1-4 = father, 5 = mother, 6 = other male, 7 = other female.
	gen headofhh_father = (inrange(Rwho_livew_age14,1,4)) if Rwho_livew_age14<.
	tab headofhh_father, m
	
	gen headofhh_mother = (Rwho_livew_age14==5) if Rwho_livew_age14<.
	tab headofhh_mother, m
	
	gen headofhh_othermale = (Rwho_livew_age14==6) if Rwho_livew_age14<.
	tab headofhh_othermale,m 
	
	gen headofhh_otherfemale = (Rwho_livew_age14==7) if Rwho_livew_age14<.
	tab headofhh_otherfemale,m 
	
	label var headofhh_father "Head of hh when R was growing up was R's father"
	label var headofhh_othermale "Head of hh when R was growing up was some other male (not R's father)"
	label var headofhh_otherfemale "Head of hh when R was growing up was some other female (not R's mother)"
	label var headofhh_mother "Head of hh when R was growing up was R's mother"
	
	//Create alternate dummy for hh head being R's father. 
	     /* Note: When R reports occupation of their father but 
	              does not specify who they lived with when
	              growing up, will assume that R lived with father.
	     */	
	gen headofhh_father_imputed = headofhh_father
	replace headofhh_father_imputed =1 if fatheroccej_nochange!=. & Rwho_livew_age14==.
	tab headofhh_father_imputed,m 
	tab headofhh_father, m
	* The living-arrangement item refers to age 14 (Rwho_livew_age14); the
	* label previously said "age 16".
	label var headofhh_father_imputed "Impute dad when parent occ != missing & no info about hh head at age 14"	
	
*-----------------------------------------------------------------------------------------------------
*  DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING)
*-----------------------------------------------------------------------------------------------------

	* These are 0 when an occupation is observed and missing otherwise;
	* nothing in this file ever sets them to 1.
	gen father_notworking =.	
	replace father_notworking =0 if fatheroccej!=.
	tab father_notworking, m 
	
	gen mother_notworking =.
	replace mother_notworking =0 if motheroccej!=.
	tab mother_notworking, m 
	
	/*Note: No apparent skip-logic to parental occupation 
	        questions. */

* Variable for father being either farm laborer or operator
	* 1 for occupation codes 71 and 81, 0 for other codes, missing when
	* fatheroccej is missing.
	gen fatherfarm=0
	replace fatherfarm=. if fatheroccej==.
	replace fatherfarm=1 if fatheroccej==71 | fatheroccej==81
	
*------------------------------------------------------------------------ 
* Save a dataset here for Appendix E exercise (drop table)
*------------------------------------------------------------------------ 

	* Full long file, before any sample restriction.
	save "./wrkdata/nlsym66_4droptable.dta", replace

*------------------------------------------------------------------------ 
* Jácome et al. vs Mazumder et al benchmarking exercise
*------------------------------------------------------------------------ 
	* Builds a one-row-per-respondent extract of parental occupation,
	* region and birth year. preserve/restore leaves the long file intact.
	preserve
		ren id id_son
		keep id_son fatheroccej motheroccej father_notworking grewup_south dob
		sort id_son

		//Preliminary step: confirm that fatheroccej, motheroccej, and grewup_south are constant by id
		foreach var of varlist fatheroccej motheroccej father_notworking grewup_south dob	{
			if "`var'"=="fatheroccej" local num 1
			if "`var'"=="motheroccej" local num 2
			if "`var'"=="grewup_south" local num 3
			if "`var'"=="dob" local num 4 
			if "`var'"=="father_notworking" local num 5 

			* Tag each distinct value of var within id (missing counts as a
			* value), then count the tags per id. A total of 1 means the
			* variable is constant within the respondent.
			egen tag`num' = tag(id_son `var'), missing
			* bysort rather than by: the per-id total then does not depend on
			* the sort order egen tag() leaves behind.
			bysort id_son: egen tag`num'_total = total(tag`num') 
			tab tag`num'_total, m //Confirmed: only 1 value of each variable for each unique id

			ren `var' `var'_nlsym66
		}
		drop tag*
		
		//Keep only one record for each unique id 
		bysort id_son: keep if _n==1
		count 
		save "./wrkdata/MD_benchmarkingexercise_nlsym66.dta", replace
	restore

*------------------------------------------------------------------------ 
*		OUR SAMPLE SELECTION (to harmonize across datasets)
*------------------------------------------------------------------------ 
	
* Weights from 1966
	* Copy each respondent's 1966 weight onto all of that respondent's rows.
	* The helper is non-missing in the 1966 row only, so min() returns it.
	tempvar w1966
	quietly bysort id: generate `w1966' = weight if year == 1966
	quietly bysort id: egen wgt = min(`w1966')

* Native-born restriction
	* Respondents with unknown country of birth are kept.
	keep if (foreignborn == 0 | foreignborn == .)
	rename weight weight_nlsym66
	sort id year
	save "./wrkdata/nlsym66_foranalysis.dta", replace

*------------------------------------------------------------------------ 
*		Create a cross section
*------------------------------------------------------------------------ 
use "./wrkdata/nlsym66_foranalysis.dta", clear
* Remove temporary variables that were saved along with the data.
capture drop __*

* More restrictions
	* 1981 rows of respondents aged 30-50 who were interviewed that year.
	sort id year
	keep if year == 1981 & inrange(age, 30, 50) & interviewed1981 == 1
	
	unique id  
	duplicates list id  //0 duplicates
	
	rename region_residence south_residence
	
* Convert family income variable to 1950 dollars
	* The deflator is looked up for the year before the survey year.
	generate year_CPI = year - 1
	merge m:1 year_CPI using "../CPI/CPI_deflator.dta", keep(master match) nogenerate
	
	generate fam_inc_real = famincv_1981 * deflator	// deflator =(CPI1950/CPI)
	label variable fam_inc_real "Family income (bins), in 1950 dollars"

	generate lnfaminc = ln(fam_inc_real)
	label variable lnfaminc "Logged family income (bins)"

	drop CPI year_CPI deflator
	
capture drop __*
save "./wrkdata/nlsym66_newxsec.dta", replace
unique id
